/*

01a_app_use.do
(this do file is called from 01_make_data.do)

Purpose: discussed below
Inputs: changelog2019, changelog2020, snapshot2019, snapshot2020, applications,
	ratex2019, ratex2020, randomizations, emails, students_update
Outputs: app_use_changelog_2019_long, app_use_changelog_2020_long, 
	app_use_snapshot_2019_long, app_use_snapshot_2020_long,
	app_use_changelog_2019_long_new, app_use_changelog_2020_long_new, 
	app_use_snapshot_2019_long_new, app_use_snapshot_2020_long_new,
	app_use_changelog_2019_new, app_use_changelog_2020_new, 
	app_use_snapshot_2019_new, app_use_snapshot_2020_new, app_use_panel_new,
	app-use-student-level
	
*/

	/*

		Create a student level file documenting changes students make to their
		smartchoice application portfolio.
		
		We do this starting from three datasets:
			- Daily Snapshots of the application databank in 2020 (apps_change_panel_2020.csv)
			- Daily app portfolio states recovered from the changelog for 2019 and 2020 ( apps_changelog_`year'.csv)
	
		All these datasets are at student x date x rank and reflect portfolio states with  
		information on choices, priority, market, ratex and grade.
		
		Steps of processing:
			1) Dates and Sample:
				- Only non-deleted applications (by admin)
				- Generate dates and timestamps (snapshot,first_date,date_ts)
				- Restrict sample based on date and first submission
			2) Fix Ranks
				- Min Rank > 1, Rank Gaps, duplicate ranks by inspection of 
					Smartchoice server
			3) Merge on treatment status and timing
				- Condition on observing the pre-randomization state of the portfolio
				
			4) Document change behavior
				- Outcomes:
					* Type: Ever, final vs. initial
					* Objects: School choices, length, risk, positioning of new choices
		
		Sample restrictions:
			- Appid NOT deleted by admin 
			<=> appid can be found in final smartchoice applications file
			- 7 Day Time window preceding application deadline
			- Only students who have submitted an application AT LEAST 6 days
			before the deadline
			- In 2020: Only students for whom we observe choices that reflect the state
				of their portfolio BEFORE they were randomized into treatments
				with certainty. This specifically drops students who have submitted
				their application between Feb 25 7pm and Feb 26 7pm AND have been randomized
				before 7pm on the 26th of February. 
			
	*/

* Application deadlines per year (ISO strings, parsed later with date(., "YMD"))
global app_deadline_2019 "2019-03-17"
global app_deadline_2020 "2020-03-02"

* Raw input files, keyed as <data source><year>.
* No snapshot file is registered for 2019.
global changelog2019 "$sim/apps_changelog_2019.csv"
global changelog2020 "$sim/apps_changelog_2020.csv"
global snapshot2020 "$sim/apps_change_panel_2020.csv"

* Suffixes of the priority/ratex definitions that are processed in parallel
global risk_types "sim final real"

* Variable name patterns of the outcomes that are set to missing when no
* baseline portfolio is observed
global app_change_outcomes "*ever *final *initial *diff initial* new*"

* Build the panel separately for every year and raw data source
foreach year in 2019 2020 {
	foreach data in "changelog" "snapshot" {
		* Load the raw csv registered under the global named <data><year>.
		* If the load fails for any reason (for instance because no file is
		* registered for this combination) the combination is skipped.
		capture insheet using "${`data'`year'}", clear
		if _rc {
			continue
		}

		* Only grades <= 0 and grade 9 enter the analysis
		keep if grade == 9 | grade <= 0

		gen year = `year'
		gen data = "`data'"

		* Look-up of appid to studentid taken from the final applications file.
		* Applications deleted by admins do not appear in that file.
		tempfile final_apps
		preserve
			insheet using "${smartchoice_final`year'}/applications.csv", clear
			keep appid studentid
			save `final_apps', replace
		restore

		* studentids change in very rare instances; on conflict the value from
		* the final applications file replaces the one in the raw data
		merge m:1 appid using `final_apps', update replace ///
			keepusing(studentid) keep(1 3 4 5)
		tabulate _merge

		* Master-only appids are missing from the final applications, i.e. they
		* were deleted by admins. They are excluded, as they would not appear in
		* the 2019 changelog data either.
		drop if _merge < 3
		drop _merge

		* Exclude applications flagged as withdrawn (snapshot data)
		drop if withdrawn == 1
		
		* Harmonise the priority and ratex variables across data sources
		/*
			Three parallel definitions are carried through the file:
			1) priority_sim, ratex_sim
				- 2020 only
				- priority as used to assign treatment and to show simulator
				  information
				- ratex from the 2019 dayof ratex
			2) priority_final, ratex_final
				- priority as run in the lottery (final)
				- ratex from the 2019 dayof ratex
				- missing for appids withdrawn before the deadline
				- missing for suburban 9th grade applicants in 2020 because of
				  new priorities
			3) priority_real, ratex_real
				- priority as run in the lottery (final)
				- ratex taken from the actual dayof placement chances of the
				  same year
				- missing for appids withdrawn before the deadline
		*/

		* The changelog carries a single priority/ratex pair: it becomes the
		* "final" definition and the "sim" definition is created as missing
		if "`data'" == "changelog" {
			rename priority priority_final
			rename ratex ratex_final
			gen priority_sim = .
			gen ratex_sim = .
		}
		gen priority_real = priority_final

		* Realized ratex: the dayof placement chance the application would have
		* had in the same year, looked up by grade, school, priority and
		* lottery group
		merge m:1 grade academyid priority_real lottery_group using "$int/ratex`year'", ///
			keepusing(ratex_real) keep(1 3) generate(_mratexreal)

		* assert _mratexreal == 3 // Make sure we are missing no ratex
		drop _mratexreal

		* Choices with priority 2 get a ratex of 1 under every definition
		* (fix for nbhd schools in 9th grade)
		foreach type of global risk_types {
			replace ratex_`type' = 1 if priority_`type' == 2
		}
		
		* ------------------------------ DATES ------------------------------
		* Daily date of the portfolio state
		gen double date_ts = date(date, "YMD")

		* Clock timestamp of the state: 7pm of that date (time of the data
		* pull and changelog date margin)
		gen double app_snapshot_ts = clock(date + " 19:00:00", "YMD hms")

		* Application deadline of the current year
		gen double deadline = date("${app_deadline_`year'}", "YMD")

		format date_ts deadline %td
		format app_snapshot_ts %tc

		* ------------------------ SAMPLE RESTRICTIONS -----------------------

		* Keep applicants whose first observed date lies at least 5 calendar
		* days before the deadline (i.e. when we finished treating in 2020).
		* NOTE(review): the file header describes this as "at least 6 days
		* before the deadline"; that matches the code only if the deadline day
		* itself is counted. Confirm the intended cutoff.
		bysort studentid: egen double first_date = min(date_ts)
		format first_date %td
		drop if deadline - first_date < 5

		* Drop appids whose first observed date falls after the deadline
		bysort appid: egen double first_app_date = min(date_ts)
		format first_app_date %td
		drop if first_app_date > deadline

		* Keep the portfolio states of the 7 days up to and including the deadline
		keep if date_ts <= deadline & deadline - date_ts < 7
		
		//////////////////////// FIX RANKS /////////////////////////////////////
		* Purpose of this section: make the ranks within every student x date_ts
		* portfolio run 1, 2, ..., n without gaps or duplicates, and stop the
		* do-file (assert) if that cannot be achieved.
		
		/// Check for valid ranks
		* In 2019 applicants can have up to 4 choices (5 for 9th graders from NH)
		* In 2020 applicants can have up to 6 choices (7 for 9th graders from NH)
		gen valid_rank = rank <= 4 | ///
			(rank <= 5 & grade == 9 & lottery_group == 0 ) | ///
			(rank <= 6 & year == 2020) | ///
			(rank <=7 & year == 2020 & grade >=9 & lottery_group == 0)
		cap noisily assert valid_rank == 1
		* The assert above is captured: a failure is printed but does not stop
		* the do-file.
		* There are a few observations that have an invalidly high rank,
		* but we have verified that these applications have actually been run
		* like this, despite not being valid
		
		/// Fix rank gaps and wrong rank mins
		* Re-rank within student x date_ts. With the track option the lowest
		* rank becomes 1 and tied ranks stay tied, so gaps and a minimum above 1
		* are removed while duplicates are left for the step further down.
		bys studentid date_ts: egen rank_fix = rank(rank) , track
		replace rank = rank_fix
		drop rank_fix
		
		// Check that all portfolios start with rank 1
		bys studentid date_ts: egen rank_min = min(rank)
		assert rank_min == 1
		
		/// Check for rank duplicates 
		* dup counts the OTHER rows sharing the same student x date_ts x rank,
		* so dup == 1 marks a pair of rows with the same rank.
		duplicates tag studentid date_ts rank, gen(dup)
			
		// For some students the default school has an incorrect rank, i.e. is not ranked last
		* For duplicates of student x date_ts x rank, increase the rank of the default schools by 1
		* NOTE(review): the default school is identified through priority_sim,
		* which is created as missing for changelog data earlier in this file,
		* so this correction can only apply to snapshot data.
		replace rank = rank + 1 if priority_sim == 2 & dup == 1 
		
		* Re-tag and require that no duplicate ranks remain
		cap drop dup
		duplicates tag studentid date_ts rank, gen(dup)
		assert dup == 0 
		
		/// Assert that all rank order inconsistencies have been removed
		* Consistency check: within each portfolio the sum of the ranks must
		* equal 1 + 2 + ... + rank_max.
		bys studentid date_ts: egen rank_max = max(rank)
		bys studentid date_ts: egen rank_sum = total(rank) 
		replace rank_sum = 0 if mi(rank_sum)
		
		* Highest rank in the data bounds the loop that builds the expected sum
		qui su rank
		local maxrank = `r(max)'
		gen exp_rank_sum = 0
		forv k = 1/`maxrank'{
			* adds k for every portfolio whose highest rank is at least k
			replace exp_rank_sum = exp_rank_sum + `k' * (rank_max >= `k')
		}

		gen inconsistent_ranks = rank_sum != exp_rank_sum
		assert inconsistent_ranks == 0 
		
		* Keep only the variables that are needed downstream
		keep studentid appid date date_ts grade rank academyid lottery_group ///
			priority* ratex* year data app_snapshot_ts

		* App length: number of ranked choices per student x date_ts, counting
		* only choices whose priority_final is not 2 (default choices excluded).
		* The count is first computed on the non-default rows and then spread
		* to all rows of the portfolio.
		tempvar n_nondefault
		bysort studentid date_ts: egen `n_nondefault' = count(rank) if priority_final != 2
		bysort studentid date_ts: egen app_length = mean(`n_nondefault')
		drop `n_nondefault'

		* ------------------- Treatment and randomization -------------------
		* Treatment status and randomization timestamp
		merge m:1 studentid year using "$int/randomizations", ///
			keepusing(block treat rand_ts) keep(1 3) nogen

		* Email timestamp
		merge m:1 studentid year using "$int/emails", ///
			keepusing(email_ts) keep(1 3) nogen

		* treated = 1 if the randomization happened before this snapshot.
		* Only defined in 2020 and when both timestamps are available.
		gen treated = (rand_ts < app_snapshot_ts) ///
			if year == 2020 & !missing(rand_ts, app_snapshot_ts)

		* any_baseline = 1 if at least one portfolio state is observed before
		* treatment, or if the student has no treatment assignment at all
		bysort studentid: egen any_baseline = min(treated)
		replace any_baseline = (any_baseline == 0) | missing(treat)

		* First and last observed date of each student
		bysort studentid: egen initial_date = min(date_ts)
		bysort studentid: egen final_date = max(date_ts)
		format initial_date final_date %td

		* Rank of every school on the student's initial and final portfolio
		* (missing if the school is not listed on that portfolio)
		capture drop rank_*
		foreach state in initial final {
			tempvar rank_at_state
			gen `rank_at_state' = rank if date_ts == `state'_date
			bysort studentid academyid: egen rank_`state' = mean(`rank_at_state')
			drop `rank_at_state'
		}

		* diff = 1 if the school's rank differs between initial and final portfolio
		gen diff = rank_initial != rank_final

		* Long file at student x date x rank level
		save "$int/app_use_`data'_`year'_long", replace

		* Residency status from the student file
		merge m:1 studentid year using "$int/students_update", ///
			keepusing(resident) keep(1 3) nogen

		* Default school: simulator priority 2, 9th grade, and resident
		* (a missing residency status is treated like resident)
		gen default_school_indicator = ///
			priority_sim == 2 & grade == 9 & (resident == 1 | missing(resident))

		* Length of the initial and final portfolio: highest initial/final rank
		* among the non-default choices, spread to all rows of the student
		foreach state in initial final {
			bysort studentid: egen applen_`state'x = max(rank_`state') ///
				if default_school_indicator == 0
			bysort studentid: egen applen_`state' = mean(applen_`state'x)
		}
		drop applen_*x

		* Full list: the portfolio uses all 4 (2019) or 6 (2020) choices
		local full_length = cond(`year' == 2019, 4, 6)
		foreach state in initial final {
			gen fullapp_`state' = applen_`state' == `full_length'
		}

		save "$int/app_use_`data'_`year'_long_new", replace
		
		* Remove the helper variables of the long file and the application id;
		* they are not carried into the student x date panel
		drop rank_* applen_* fullapp_* default_school_indicator diff appid

		* Highest rank in the data; bounds the per-rank loops further down
		summarize rank, meanonly
		local rankmax = r(max)

		* ---------- Bring dataset to student x date_ts level ----------
		sort studentid date_ts rank

		* Append an underscore to the rank-specific variables so that the wide
		* names read <name>_<rank>
		foreach v of varlist academyid *_sim *_final *_real lottery_group {
			rename `v' `v'_
		}

		reshape wide *_sim_ *_final_ *_real_ lottery_group_ academyid_ , ///
			i(studentid date year data) j(rank)

		sort grade studentid date_ts

		* ------------------------- App use outcomes -------------------------

		* Ratex risk level by date, for each of the three risk definitions
		/*
			Note: to make the ratex_risk equivalent to the app_risk, the
			probability of being placed at the default school is part of this
			measure. Applicants with a default school (resident 9th grade
			applicants) therefore have a ratex_risk of 0 regardless of the
			rest of their portfolio.
		*/
		foreach type of global risk_types {
			capture drop pr_placed_`type'
			capture drop ratex_risk_`type'

			* Placement chance: walk down the ranks and add the chance of being
			* placed at rank k given no placement at a higher-ranked choice.
			* Ranks without a ratex are skipped.
			gen pr_placed_`type' = 0
			forvalues k = 1/`rankmax' {
				replace pr_placed_`type' = pr_placed_`type' + ///
					(1 - pr_placed_`type') *ratex_`type'_`k' ///
					if !missing(ratex_`type'_`k')
			}

			* Risk of not being placed at all
			gen ratex_risk_`type' = 1 - pr_placed_`type' if !missing(pr_placed_`type')
		}

		* If every ratex_final of a portfolio is missing, the final placement
		* chance and risk are set to missing rather than left at 0 and 1
		tempvar best_final
		egen `best_final' = rowmax(ratex_final_?)
		foreach v in pr_placed_final ratex_risk_final {
			replace `v' = . if missing(`best_final')
		}
		drop `best_final'

		* Changelog records have no simulated ratex: use the final values instead
		foreach stub in pr_placed ratex_risk {
			replace `stub'_sim = `stub'_final if data == "changelog"
		}

		* Changes in ratex risk between the initial and the final portfolio
		foreach type of global risk_types {
			capture drop risk_`type'_initial*

			* Initial risk: the last state before treatment ...
			tempvar base_risk
			bysort studentid treated (date_ts): ///
				gen `base_risk' = ratex_risk_`type'[_N] if treated == 0
			* ... or the first observed state for students without a
			* treatment assignment
			bysort studentid (date_ts): ///
				replace `base_risk' = ratex_risk_`type'[1] if missing(treat)
			bysort studentid: egen risk_`type'_initial = mean(`base_risk')
			drop `base_risk'

			* Final risk: the last observed state
			bysort studentid (date_ts): gen risk_`type'_final = ratex_risk_`type'[_N]

			gen risk_`type'_diff = risk_`type'_final - risk_`type'_initial
		}
		
		* ------------------ Changes in application length ------------------

		* Day-over-day change of the application length.
		* Note: Stata orders missing values above every number, so a missing
		* app_length on either side of the comparison still yields 0 or 1.
		capture drop lengthen_app
		capture drop shorten_app
		bysort studentid (date_ts): gen lengthen_app = app_length > app_length[_n - 1]
		by studentid: gen shorten_app = app_length < app_length[_n - 1]

		* Only count changes after treatment was received (or after the first
		* recorded date_ts for students without a treatment timestamp)
		foreach v in lengthen_app shorten_app {
			by studentid: replace `v' = 0 if treated == 0 | _n == 1
		}

		* Initial app length: the last application before being treated ...
		tempvar base_length
		bysort studentid treated (date_ts): ///
			gen `base_length' = app_length[_N] if treated == 0
		* ... or the first recorded date_ts if there is no treatment assignment
		bysort studentid (date_ts): ///
			replace `base_length' = app_length[1] if missing(treat)
		bysort studentid: egen app_length_initial = mean(`base_length')
		drop `base_length'

		* Final outcomes: compare the last observed length with the initial one
		capture drop lengthen_app_final
		capture drop shorten_app_final
		capture drop len_diff_final
		bysort studentid (date_ts): ///
			gen lengthen_app_final = app_length[_N] > app_length_initial
		by studentid: ///
			gen shorten_app_final = app_length[_N] < app_length_initial
		* Signed difference: final minus initial length
		by studentid: ///
			gen len_diff_final = app_length[_N] - app_length_initial

		* Any time outcomes: 1 if the change happened on at least one date
		bysort studentid: egen lengthen_app_ever = max(lengthen_app)
		bysort studentid: egen shorten_app_ever = max(shorten_app)
		
		* --------- Change school choices at fixed application length ---------

		* Day-over-day change of any listed school.
		* The condition reads, OR-ed over all ranks k: rank k holds a different
		* school than on the previous date AND rank k is filled on both dates.
		* Ranks that are filled on only one of the two dates (a change of the
		* app length) therefore do not count.
		* The global is built by plain macro assignment, so the growing
		* condition is never passed through the string-expression evaluator.
		global any_change_school_lines ""
		forv k = 1/`rankmax' {
			local rank_cond "academyid_`k' != academyid_`k'[_n - 1] & !mi(academyid_`k') & !mi(academyid_`k'[_n - 1])"
			if `k' == 1 {
				global any_change_school_lines "`rank_cond'"
			}
			else {
				global any_change_school_lines "$any_change_school_lines | `rank_cond'"
			}
		}

		cap drop change_school
		bys studentid (date_ts): gen change_school = $any_change_school_lines

		* Only count changes after treatment was received
		* (or after the first recorded date_ts for students without a
		* treatment timestamp)
		bys studentid (date_ts): replace change_school = 0 ///
			if treated == 0 | _n == 1

		* Change of the listed school by rank
		forv k = 1/`rankmax' {
			bys studentid (date_ts): ///
				gen change_r`k' = academyid_`k' != academyid_`k'[_n - 1] ///
					if !mi(academyid_`k') & !mi(academyid_`k'[_n - 1])

			* Same masking as for change_school. The by-prefix is required
			* here: without it _n == 1 would refer to the first row of the
			* whole dataset instead of each student's first date.
			bys studentid (date_ts): replace change_r`k' = 0 ///
				if treated == 0 | _n == 1

			* Initial choice at rank k: the last date_ts before being treated
			bys studentid treated (date_ts): ///
				gen initial_rank`k'x = academyid_`k'[_N] if treated == 0
			* For students without a treatment assignment this is the first
			* date_ts overall
			bys studentid (date_ts): ///
				replace initial_rank`k'x = academyid_`k'[1] if mi(treat)
			* NOTE(review): gen and egen default to float storage; this is
			* exact only while academyid values stay below 2^24 - confirm.
			bys studentid: ///
				egen initial_academyid_`k' = mean(initial_rank`k'x)
			drop initial_rank`k'x
		}

		* Any change of listed schools over the whole time window
		bys studentid: egen change_school_ever  = max(change_school)

		* Change of any school between the final and the initial portfolio,
		* again only on ranks that are filled in both portfolios
		global final_change_school_lines ""
		forv k = 1/`rankmax' {
			local rank_cond "initial_academyid_`k' != academyid_`k'[_N] & !mi(initial_academyid_`k') & !mi(academyid_`k'[_N])"
			if `k' == 1 {
				global final_change_school_lines "`rank_cond'"
			}
			else {
				global final_change_school_lines "$final_change_school_lines | `rank_cond'"
			}
		}
		bys studentid (date_ts): gen change_school_final = $final_change_school_lines
		
		* ------------------- Position of newly added schools -------------------
		/*
			Two different positions are considered:
				- insert: a new school ranked higher than at least one initial choice
				- append: a new school that is ranked lower than all initial choices
		*/

		* For every rank on every date: is the listed school new relative to the
		* initial portfolio? 1 = new, 0 = already on the initial portfolio,
		* missing = rank not filled.
		forv new = 1/`rankmax' {
			gen new_school`new' = 1 if !mi(academyid_`new')
			forv old = 1/`rankmax' {
				replace new_school`new' = 0 ///
					if academyid_`new' == initial_academyid_`old' & !mi(academyid_`new')
			}
		}

		* Any new school on any rank
		egen new_school_any = rowmax(new_school?)

		* Lowest position (highest rank number) that an initial choice holds on
		* the current portfolio. Set to 99 as soon as one initial choice is no
		* longer listed at all.
		gen max_rank_initial_choices = 1
		forv old = 1/`rankmax' {
			* 0/1 flag: is initial choice number old still on the portfolio?
			* Missing if the initial portfolio had no choice at this rank.
			tempvar still_listed
			gen byte `still_listed' = 0 if !mi(initial_academyid_`old')
			forv new = 1/`rankmax' {
				local hit "initial_academyid_`old' == academyid_`new' & !mi(initial_academyid_`old')"
				replace `still_listed' = 1 if `hit'
				replace max_rank_initial_choices = `new' ///
					if `hit' & `new' > max_rank_initial_choices
			}
			replace max_rank_initial_choices = 99 if `still_listed' == 0
			drop `still_listed'
		}

		* Best position (lowest rank number) of a new school; 100 if the
		* portfolio holds no new school
		gen min_rank_insert = 100 if new_school_any == 0
		forv k = 1/`rankmax' {
			replace min_rank_insert = `k' if new_school`k' == 1 & `k' < min_rank_insert
		}

		* New school added before at least one school from the initial application
		gen new_insert = new_school_any == 1 ///
			& min_rank_insert < max_rank_initial_choices

		* New school added below every school from the initial portfolio.
		* The explicit == 1 keeps a missing new_school_any from counting as
		* true (Stata treats any non-zero value, including missing, as true).
		gen new_append = new_school_any == 1 ///
			& min_rank_insert > max_rank_initial_choices

		* Student-level variables: ever a new school
		bys studentid: egen new_append_ever = max(new_append)
		bys studentid: egen new_insert_ever = max(new_insert)

		* Final vs initial: status on the last observed date
		bys studentid (date_ts): gen new_append_final = new_append[_N]
		bys studentid (date_ts): gen new_insert_final = new_insert[_N]

		drop min_rank* max_rank*

		sort grade studentid date_ts

		* Any modification: a school change or a change of the app length
		gen modify_ever = change_school_ever == 1 | lengthen_app_ever == 1 | shorten_app_ever == 1
		gen modify_final = change_school_final == 1 | lengthen_app_final == 1 | shorten_app_final == 1

		/// Labeling

		* Date-level variables (one value per student x date)
		label var app_length "Length of of last application on a given date"
		label var change_school "Any different school listed compared to portfolio of previous day"
		label var lengthen_app "Increase in the applicaton length compared to previous day"
		label var shorten_app "Decrease in the applicaton length compared to previous day"

		* Student-level application length outcomes
		label var lengthen_app_final "Lengthen app. (final)"
		label var lengthen_app_ever "Lengthen app. (anytime)"
		label var shorten_app_ever "Shorten app. (any time)"
		label var shorten_app_final "Shorten app. (final)"
		* NOTE(review): len_diff_final is computed as final minus initial app
		* length, i.e. a signed difference, while the label says "Absolute".
		label var len_diff_final "Absolute app. length difference (final)"

		* Student-level school change outcomes
		label var change_school_final "Change school (final)"
		label var change_school_ever "Change school (anytime)"
		
		* Position of newly added schools
		label var new_insert_ever "Insert new school (anytime)"
		label var new_append_ever "Append new school (anytime)"
		label var new_insert_final "Insert new school (final)"
		label var new_append_final "Append new school (final)"
		
		label var modify_ever "Change length or school (any time)"
		label var modify_final "Change length or school (final)"
		
		* Risk differences (final minus initial) per risk definition
		label var risk_sim_diff "Difference in simulated risk"
		label var risk_real_diff "Difference in realized risk"
		label var risk_final_diff "Difference in simulated final risk"
		
		// Set outcomes to missing, if we don't observe any baseline state of portfolio
		* The variables are selected through the name patterns stored in the
		* global app_change_outcomes. Besides the student-level outcomes, the
		* patterns also match the date-level variables pr_placed_final and
		* ratex_risk_final, initial_date, the initial_academyid variables and
		* all variables whose name starts with new.
		foreach outcome of varlist $app_change_outcomes {
			
			replace `outcome' = . if any_baseline == 0 
		}
		* Student x date panel for this year and data source
		save "$int/app_use_`data'_`year'_new", replace
	}
}

* ---------------- Make one long app panel over dates ----------------
* Stack the 2019 changelog panel and the 2020 snapshot panel
clear
append using "$int/app_use_changelog_2019_new" "$int/app_use_snapshot_2020_new"
save "$int/app_use_panel_new", replace

* --------------------- Make student level file ----------------------
use "$int/app_use_panel_new", clear

* Everything that varies by date or by rank is removed, so that the remaining
* variables are constant within student x year x data source
local date_level_vars date* *1 *2 *3 *4 *5 *6 *7 app_length change_school ///
	lengthen_app shorten_app treated *ts ratex_risk* pr_placed* ///
	new_school* new_insert new_append
drop `date_level_vars'

* Collapse the identical rows and verify one row per student, year and source
duplicates drop
isid studentid year data

save "$int/app-use-student-level", replace